#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from a clean workspace before the helpers and data are loaded.
# `all.names = TRUE` also removes objects whose names begin with a dot.
# The earlier spelling `all=t` handed base R's transpose function `t` to `ls()`
# instead of the logical TRUE, and relied on partial argument matching.
rm(list = ls(all.names = TRUE))

#' # Setup filenames

# Base name of the dataset; the output file names and the report title are
# built from it further down.
filename <- "Costa Rica_Public Use" # !!!Update filename
# Script that provides the helper functions used in the rest of this file.
functions_vers <- "functions_1.6.R" # !!!Update helper functions file

#' # Setup data, functions and create dictionary for dataset review
source(functions_vers)
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

#' # Direct PII: variables to be removed
# !!!No Direct PII

#' # Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects
#' !!!Replace vector in "variables" field below with relevant variable names

# Encode Direct PII-team: the interviewer variable is handed to the helper and
# the returned dataset replaces `mydata`.

mydata <- encode_direct_PII_team(variables = "b_entrevistador")

#' # Small locations: Encode locations with pop <100,000 using random large numbers
#' !!!Include relevant variables, but check their population size first to confirm they are <100,000

# Location variables to encode; 999999 is passed as the missing-value code.
locvars <- "districtnum"
mydata <- encode_location(variables = locvars, missing = 999999)

#' # Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" of 10 or less.

# Head-of-household education: one set of break points and value labels,
# applied to both `hhheadeduc` and `e_hhheadeduc`.
break_edu <- c(1, 2, 3, 4, 5, 7, 777, 888)
labels_edu <- c(
  "1. None" = 1,
  "2. Don't know" = 2,
  "3. Primary school" = 3,
  "4. Secondary school" = 4,
  "5. Technical or vocational school or University" = 5
)
mydata <- ordinal_recode(
  variable = "hhheadeduc",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
mydata <- ordinal_recode(
  variable = "e_hhheadeduc",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

# `e_p8` uses its own coding, so the break points and labels are redefined
# before the recode.
break_edu <- c(0, 1, 2, 3, 5)
labels_edu <- c(
  "0. None" = 1,
  "1. Completed primary" = 2,
  "2. Completed secondary" = 3,
  "3. Completed Technical or University" = 4,
  "4. Don't Know" = 5
)
mydata <- ordinal_recode(
  variable = "e_p8",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

# Top code household composition variables with large and unusual numbers.
# Every variable below uses the same break point: cases with 10 or more
# individuals are top-coded.

mydata <- top_recode("hhsize", break_point = 10, missing = 999999)
mydata <- top_recode("b_hhsize", break_point = 10, missing = 999999)
mydata <- top_recode("e_hhsize", break_point = 10, missing = 999999)
mydata <- top_recode("b_q6", break_point = 10, missing = 999999)
mydata <- top_recode("b_q8", break_point = 10, missing = 999999)
mydata <- top_recode("e_p4", break_point = 10, missing = 999999)
mydata <- top_recode("e_p6", break_point = 10, missing = 999999)

# Dates of birth removed, as strong identifier and ages already provided in separate variables

dob_vars <- c("b_q3a", "b_q3b", "b_q3c")
mydata <- mydata[!(names(mydata) %in% dob_vars)]

# Combine technical and university education of head of household.
# The combined flag is 1 when either source flag equals 1 and 0 otherwise; it
# is inserted right after the matching "*_headsec" column.

b_tech_flag <- mydata$b_headtech == 1
b_univ_flag <- mydata$b_headuniv == 1
b_headtech_univ <- ifelse(b_tech_flag, 1, ifelse(b_univ_flag, 1, 0))
mydata <- add_column(mydata, b_headtech_univ, .after = "b_headsec")

e_tech_flag <- mydata$e_headtech == 1
e_univ_flag <- mydata$e_headuniv == 1
e_headtech_univ <- ifelse(e_tech_flag, 1, ifelse(e_univ_flag, 1, 0))
mydata <- add_column(mydata, e_headtech_univ, .after = "e_headsec")

# The separate technical / university flags are dropped once combined.
separate_flags <- c("b_headtech", "b_headuniv", "e_headtech", "e_headuniv")
mydata <- mydata[!(names(mydata) %in% separate_flags)]

# Same merge for the text-coded education item.
merge_levels <- c("Tecnico", "Universidad")
mydata$b_q10[mydata$b_q10 %in% merge_levels] <- "Tecnico/Universidad"

# Encode school names

school_vars <- c("b_q23", "e_p23ot")
mydata <- encode_location(variables = school_vars, missing = NA)

# Recode "apartamento" into "otro".

is_apartment <- mydata$b_q45 == "Apartamento"
mydata$b_q45[is_apartment] <- "Otro"

# !!!Include relevant variables in list below

indirect_PII <- c(
  "hhhead", "hhheadgrand", "hhheadsib", "hhheadauntuncle", "hhheadnon",
  "hhheadself", "attend", "attend90", "q36c", "q36f", "b_schdrop_bl",
  "b_q7", "b_q9", "b_q10", "b_q12", "b_q20", "b_q23", "b_q30", "b_q34",
  "b_q39", "b_q45", "b_q46", "e_p23ot"
)

# Pass the listed variables to the helper that captures their tables.
capture_tables(indirect_PII)

# Recode those with very specific values where more than half of the sample have actual data.

mydata <- encode_direct_PII_team(variables = "b_schdrop_bl")

#' # Matching and crosstabulations: Run automated PII check

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars <- c("male", "hhheadeduc", "ageyears") ##!!! Replace with candidate categorical demo vars

# weight variable
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial

#' Show values of key variable of records that violate k-anonymity
#mydata <- labelDataset(mydata)
# Flag records whose value in column 2 of the individual risk table is below 2.
# NOTE(review): column 2 is assumed to be the sample frequency count (fk) of
# the key-variable combination -- confirm against the installed sdcMicro.
notAnon <- sdcInitial@risk$individual[, 2] < 2 # for 2-anonymity
mydata[notAnon, selectedKeyVars]
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables

extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH

# Education variables to blank out for the flagged records. The separate
# technical / university flags are removed from `mydata` earlier in this script
# and replaced by "b_headtech_univ" / "e_headtech_univ", so the combined
# columns are listed here. Naming the removed columns would re-create them as
# empty columns and leave the combined flags unsuppressed.
hheduc <- c(
  "hhheadeduc",
  "b_hhheadeduc",
  "b_headnoeduc",
  "b_headdkeduc",
  "b_headprim",
  "b_headsec",
  "b_headtech_univ",
  "e_hhheadeduc",
  "e_headnoeduc",
  "e_headdkeduc",
  "e_headprim",
  "e_headsec",
  "e_headtech_univ"
)

# Only touch columns that exist: assigning to an unknown name would add a new,
# empty column to the public-use file. Report anything that was skipped so a
# misspelled variable does not go unnoticed.
hheduc_absent <- setdiff(hheduc, names(mydata))
if (length(hheduc_absent) > 0) {
  warning(
    "Education variables not found in mydata and skipped: ",
    paste(hheduc_absent, collapse = ", "),
    call. = FALSE
  )
}
hheduc_present <- intersect(hheduc, names(mydata))
mydata[notAnon, hheduc_present] <- NA

# Rebuild the sdcMicro object on the suppressed data and print it again.
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial

#' # Open-ends: review responses for any sensitive information, redact as necessary
#'
# !!! Identify open-end variables here:
open_ends <- "b_q30oth"
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number

# Removed, verbatim response in Spanish with high reidentification risk
mydata <- mydata[names(mydata) != "b_q30oth"]

#' # GPS data: Displace
# !!! No GPS

#' # Save processed data in Stata and SPSS format
#' Adds "_PU" (Public Use) to the end of the name

# Build both output paths from the dataset base name, then write the Stata
# file followed by the SPSS file.
stata_path <- paste0(filename, "_PU.dta")
spss_path <- paste0(filename, "_PU.sav")
haven::write_dta(mydata, stata_path)
haven::write_sav(mydata, spss_path)

# Add report title dynamically: the dataset name is appended to a fixed prefix
# and the result is referenced by the inline YAML that follows.
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
